# load packages
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(readr,cowplot, here, janitor, tidyverse, lubridate, psych, knitr, kableExtra, apaTables, arsenal, papaja, tidytext, stm, ggpubr, tm, ggrepel, ggcorrplot, qgraph, mgcv, ggpattern)

## ------------------------------
# the process of build STM Model
## ------------------------------
# for time saving, we have saved the model for faster loading
# please feel free to run the code below to build the model by yourself

# bloc_stm <- read_csv("data/bloc.csv")
# temp <- textProcessor(
# documents = bloc_stm$English_segment, 
# metadata = bloc_stm[c('Name_of_Media', 'date', 'Side')],
# lowercase = TRUE,
# removestopwords = TRUE,
# removenumbers = TRUE,
# removepunctuation = TRUE,
# stem = FALSE
# )
# out <- prepDocuments(temp$documents, temp$vocab, temp$meta, lower.thresh = 50)
# docs <- out$documents 
# vocab <- out$vocab
# meta <- out$meta
# save(temp, out, docs, vocab, meta, file = "data/stmmodel.RData")

## ------------------------------
# the process of searching for the optimal number of topics
## ------------------------------
# for time saving, we have aslo saved the results for faster loading
#search for the number of topics
#search <- searchK(out$documents,
#                    out$vocab,
#                    data = out$meta,
#                    prevalence = ~ Side + Name_of_Media,
#                    max.em.its = 100,
#                    init.type = "Spectral",
#                    K = c(5:30),
#                    verbose = TRUE, seed = 3223212)

# save K_search result
#saveRDS(K_search, "data/K_search.rds")

## ------------------------------
# the process of fitting STM model with K=10
## ------------------------------
# for time saving, we have saved the results for faster loading
# conduct topic modeling based on K=10
#set.seed(3223212)
#stm_10 <- stm(out$documents,
#              out$vocab,
#              data = out$meta,
#              init.type = 'Spectral',
#              K = 10,
#              verbose = FALSE)
# save STM k=10 result
#saveRDS(stm_10, file = "data/stm_10.rds")



################################################
# load stm model
load("data/stmmodel.RData")
## ------------------------------
# figure 1
## ------------------------------
stm_10 <- readRDS("data/stm_10.rds")
topic_labels <- labelTopics(stm_10, n = 7)
frex_df <- data.frame(topic = factor(1:10), 
                      frex_words = apply(topic_labels$frex, 1, function(x) paste(x, collapse = ", ")))
out$meta$doc_id <- 1:nrow(out$meta)
stm_10$meta = out$meta
stm_10_beta <- tidy(stm_10, matrix = "beta") %>% 
  group_by(topic) %>% 
  slice_max(beta, n = 5000, with_ties = FALSE) %>% 
  arrange(topic, desc(beta), .by_group = TRUE) %>% 
  mutate(topic = as_factor(topic))
stm_10_gamma <- tidy(stm_10, matrix = "gamma") %>%
  mutate(doc_id = 1:n()) %>%
  right_join(out$meta, by = c('document' = 'doc_id')) %>%
  group_by(topic) %>%
  arrange(topic, desc(gamma)) %>%
  mutate(topic = as_factor(topic))
stm_10_beta_top_terms <- stm_10_beta %>%
  group_by(topic) %>%
  slice_max(beta, n = 5) %>%
  select(topic, term, beta) %>%
  summarise(terms = list(term)) %>%
  mutate(top_terms = map(terms, paste, collapse = ", "), topic = as_factor(topic)) %>% 
  unnest(cols = top_terms) %>% 
  select(topic, top_terms)
stm_10_tidy_bg <- stm_10_gamma %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma, na.rm = TRUE)) %>%
  left_join(frex_df, by = "topic") %>%
  mutate(topic = factor(topic))

topic_distribution_by_side <- stm_10_gamma %>%
  group_by(Side, topic) %>%
  summarise(avg_gamma = mean(gamma, na.rm = TRUE), .groups = "drop") %>%
  group_by(Side) %>%
  mutate(percentage = avg_gamma / sum(avg_gamma) * 100)


topic_labels <- c("Topic 1 Global Inflation and Pandemic", 
                  "Topic 2 Western Partisan Politics", 
                  "Topic 3 BRICS and International Cooperation", 
                  "Topic 4 Taiwan Sovereignty Issue", 
                  "Topic 5 Russia-Ukraine War", 
                  "Topic 6 Cultural and Technical Exchange", 
                  "Topic 7 Disasters and Accidents", 
                  "Topic 8 Sanction and Finance", 
                  "Topic 9 NATO Expansion", 
                  "Topic 10 Energy Trade and Price")

topic_distribution_by_side$topic_label <- factor(topic_labels[as.numeric(str_remove(topic_distribution_by_side$topic, "Topic "))], levels = topic_labels)

colors <- c("China" = "#5773CCFF", "Russia" = "#FFB900FF")

topic_side <- ggplot(topic_distribution_by_side, aes(x = topic_label, y = percentage, fill = Side)) +
  geom_bar(stat = "identity", position = "dodge", show.legend = TRUE, width = 0.6) +  
  scale_fill_manual(values = colors) +  
  labs(y = "Percentage", x = "Topic", title = "Topic Distribution by Side") +
  theme_minimal() +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 75, hjust = 1)) +  
  guides(fill = guide_legend(nrow = 1))

ggsave("fig_paper/figure1.png", plot = topic_side, width = 10, height = 8, dpi = 300)


## ------------------------------
# paper figure2
## ------------------------------
# estimate side effect on topic prevelance
# update the model with date as numeric variable
# for time saving, we have saved the results for faster loading
out$meta$date_numeric <- as.numeric(difftime(out$meta$date, min(out$meta$date), units = "days"))
#stm_10_updated <- stm(out$documents, 
#                      out$vocab,
#                      data = out$meta,
#                      prevalence =~ Side + date_numeric,
#                      init.type = 'Spectral',
#                      K = 10,
#                      verbose = FALSE)
#saveRDS(stm_10_updated, file = "data/stm_10_updated.rds")
stm_10_updated <- readRDS("data/stm_10_updated.rds")
prepeff <- estimateEffect(1:10 ~ Side + s(date_numeric), stm_10_updated,
                          meta = out$meta, uncertainty = "Global")

plot(prepeff, covariate = "Side", topics = c(1,2,3,4,5,6,7,8,9,10),
     model = stm_10_updated, method = "difference", cov.value1 = "Russia",
     cov.value2 = "China",
     xlab = "Russia ... China",
     main = "Effect of Russia vs. China", xlim = c(-0.4, 0.3))

png("fig_paper/figure2.png", width=1000, height=700, res=100)
plot(prepeff, covariate = "Side", topics = c(9,2,8,7,5,10,6,1,4,3),
     model = stm_10_updated, method = "difference", cov.value1 = "Russia",
     cov.value2 = "China",
     xlab = "China ... Russia",
     xlim = c(-0.5, 0.3),
     cex = 0.1,
     labeltype = "custom", custom.labels = c("Topic 9 NATO Expansion",
                                             "Topic 2 Western Partisan Politics",
                                             "Topic 8 Sanction and Finance",
                                             "Topic 7 Disasters and Accidents" ,
                                             "Topic 5 Russia-Ukraine War",
                                             "Topic 10 Energy Trade and Price",
                                             "Topic 6 Cultural and Technical Exchange",
                                             "Topic 1 Global Inflation and Pandemic",
                                             "Topic 4 Taiwan Sovereignty Issue",
                                             "Topic 3 BRICS and International Cooperation"))
dev.off()




## ------------------------------
## Figure Appendix  A3_a
# plot of K_search indicators
## ------------------------------

# load the K_search results
K_search <- readRDS("data/K_search.rds")
plot.searchK <- function(x, ...){
  oldpar <- par(no.readonly=TRUE)
  g <- x$results
  par(mfrow=c(2,2),mar=c(2,2,2,2),oma=c(0,0,2,0), cex=3.5)
  
  plot(g$K,g$heldout,type="p", main="Held-out likelihood", xlab="", ylab="")
  lines(g$K,g$heldout,lty=2,col=2)
  
  plot(g$K,g$residual,type="p", main="Residuals", xlab="", ylab="")
  lines(g$K,g$residual,lty=2,col=2 )
  
  if(!is.null(g$semcoh)){
    plot(g$K,g$semcoh,type="p", main="Semantic coherence", xlab="", ylab="")
    lines(g$K,g$semcoh,lty=2,col=2 )
  }
  
  plot(g$K,g$lbound,type="p", main="Lower bound", xlab="", ylab="")
  lines(g$K,g$lbound,lty=2,col=2 )
  
  par(oldpar)
}

png("fig_appendix/A3_a.png", width = 1800, height = 1200)
plot.searchK(K_search)
dev.off() 

## ------------------------------
## Figure Appendix  A3_b
## ------------------------------
K_search$results %>% 
  as.data.frame() %>% 
  mutate(across(c(exclus, semcoh), as.numeric)) %>% 
  ggplot(aes(x = semcoh, y = exclus)) +
  geom_point() +
  geom_label_repel(aes(label = K), max.overlaps = Inf) +  
  labs(x = "Semantic Coherence", 
       y = "Exclusivity") +
  theme_pubr()

ggsave(file = "fig_appendix/A3_b.jpeg", width = 5, height = 5)



## ------------------------------
# figure appendix A8
## ------------------------------
# load STM k=10 result
topic_labels <- labelTopics(stm_10, n = 7)

frex_df <- data.frame(topic = factor(1:10), 
                      frex_words = apply(topic_labels$frex, 1, function(x) paste(x, collapse = ", ")))

#use tidyverse to plot topics
out$meta$doc_id <- 1:nrow(out$meta)
stm_10$meta = out$meta
# get beta
stm_10_beta <- tidy(stm_10, matrix = "beta") %>% 
  group_by(topic) %>% 
  slice_max(beta, n = 5000, with_ties = FALSE) %>% 
  arrange(topic, desc(beta), .by_group = TRUE) %>% 
  mutate(topic = as_factor(topic))

stm_10_gamma <- tidy(stm_10, matrix = "gamma") %>%
  mutate(doc_id = 1:n()) %>%
  right_join(out$meta, by = c('document' = 'doc_id')) %>%
  group_by(topic) %>%
  arrange(topic, desc(gamma)) %>%
  mutate(topic = as_factor(topic))

stm_10_beta_top_terms <- stm_10_beta %>%
  group_by(topic) %>%
  slice_max(beta, n = 5) %>%
  select(topic, term, beta) %>%
  summarise(terms = list(term)) %>%
  mutate(top_terms = map(terms, paste, collapse = ", "), topic = as_factor(topic)) %>% 
  unnest(cols = top_terms) %>% 
  select(topic, top_terms)

stm_10_tidy_bg <- stm_10_gamma %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma, na.rm = TRUE)) %>%
  left_join(frex_df, by = "topic") %>%
  mutate(topic = factor(topic))

topic_p = stm_10_tidy_bg %>%
  mutate(topic = reorder(topic, gamma)) %>%
  ggplot(aes(x = topic, y = gamma)) +
  geom_segment(aes(xend = topic, yend = 0.04)) +
  geom_point(size = 3) +
  labs(
    title = "Topics with their Average Proportions and Top FREX Words",
    subtitle = "Top 7 FREX terms per topic",
    x = "Topic",
    y = "Average Proportion"
  ) +
  geom_text(aes(label = frex_words), hjust = -0.1, size = 4, check_overlap = TRUE) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(),
    axis.title.x = element_text(),
    axis.title.y = element_text(),
    axis.text.x = element_text(),
    axis.text.y = element_text()
  ) +
  coord_flip(ylim = c(0.04, 0.5))

ggsave("fig_appendix/A8.png", topic_p, width = 10, height = 10, units = "in")



## ------------------------------
# figure appendix A5
## ------------------------------
topic_proportion_by_date_and_side <- stm_10_gamma %>%
  group_by(date, Side, topic) %>%
  summarise(avg_gamma = mean(gamma, na.rm = TRUE), .groups = 'drop') %>%
  group_by(date, Side) %>%
  mutate(total_gamma = sum(avg_gamma, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(percentage = (avg_gamma / total_gamma) * 100)

colors <- c("#9e0142", "#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#66c2a5", "#3288bd", "#5e4fa2")
topics <- c("Topic 1 Global Inflation and Pandemic",
            "Topic 2 Western Partisan Politics",
            "Topic 3 BRICS and International Cooperation",
            "Topic 4 Taiwan Sovereignty Issue",
            "Topic 5 Russia-Ukraine War",
            "Topic 6 Cultural and Technical Exchange",
            "Topic 7 Disasters and Accidents",
            "Topic 8 Sanction and Finance",
            "Topic 9 NATO Expansion",
            "Topic 10 Energy Trade and Price")

plots_list <- list()

for(side in unique(topic_proportion_by_date_and_side$Side)) {
  side_data <- topic_proportion_by_date_and_side %>%
    filter(Side == side) %>%
    mutate(topic = factor(topic, levels = 1:10, labels = topics)) %>%
    pivot_longer(cols = "percentage", names_to = "variable", values_to = "value")

  p <- ggplot(side_data, aes(x = date, y = value, fill = topic)) +
    geom_area(alpha = 0.6, position = "stack") +
    scale_y_continuous(labels = scales::percent_format(scale = 1)) +
    scale_fill_manual(values = colors, name = "Topics") +
    labs(title = paste("Expected Topic Proportion Overtime:", side),
         x = "Date", y = "Expected Topic Proportion (%)") +
    theme_minimal()
  
  plots_list[[side]] <- p
}

topic_overtime <- plot_grid(plotlist = plots_list, ncol = 1)
ggsave("fig_appendix/A5.png", topic_overtime, width = 15, height = 10, units = "in")

## ------------------------------
# figure appendix A6
## ------------------------------
# load the stm object based on the data how Russia and China talk about each other
load("data/each_other.RData")
# load each other stm model where k=10
eachotherstm10 <- readRDS("data/each_other_stm10.rds")
out$meta$date_numeric <- as.numeric(difftime(out$meta$date, min(out$meta$date), units = "days"))

prepeff <- estimateEffect(1:10 ~ Side + s(date_numeric), eachotherstm10,
                          meta = out$meta, uncertainty = "Global")


png("fig_appendix/A6.png", width=1000, height=700, res=100)
plot(prepeff, covariate = "Side", topics = c(9,4,8,6,1,5,2,10,7,3),
     model = eachotherstm10, method = "difference", cov.value1 = "Russia",
     cov.value2 = "China",
     xlab = "China ... Russia",
     xlim = c(-0.3, 0.3),
     cex = 0.1,
     labeltype = "custom", custom.labels = c("Topic 9 Bilateral Economic Cooperation",
                                             "Topic 4 US Containment",
                                             "Topic 8 Russia Diplomatic Outreach",
                                             "Topic 6 Military and Defense",
                                             "Topic 1 Taiwan Sovereignty Issue",
                                             "Topic 5 Bilateral High_level Cooperation",
                                             "Topic 2 Energy and Inflation",
                                             "Topic 10 BRICS and Cooperation",
                                             "Topic 7 NATO Confrontation",
                                             "Topic 3 Russia-Ukraine War"))
dev.off()



